1 Plotting function

    # plotting function
    plt_avg_expr_per_fou_by_gn = function(data) {
        # calculate number of GN datasets per gene to be plotted
        n_gn_per_gene = data %>% distinct(GN, gene_name) %>% count(gene_name)

        # make individual panels
        p_list = data %>%
            mutate(panel_id = gene_name) %>%
            nest(data = !panel_id) %>%
            mutate(p = map(data, function(.data) {
                # center y-axis for each dataset, because expression values are dataset dependent
                # comparing different studies
                if (1) {
                    .data = .data %>%
                        group_by(GN) %>%
                        mutate(expr_val = scale(expr_val, center = TRUE, scale = FALSE)[,1]) %>%
                        ungroup
                }

                # only want "B" and "D" genotypes
                .data = .data %>% 
                    filter(gt %in% c('0/0', '1/1')) %>%
                    mutate(gt = recode(gt, `0/0` = 'B', `1/1` = 'D')) 
                
                # dataset ordering by difference b/w B and D
                dset_lvls = .data %>% 
                    group_by(GN, gt) %>%
                    summarise(expr_val = mean(expr_val), .groups = 'drop') %>%
                    pivot_wider(id_cols = GN, names_from = gt, values_from = expr_val) %>%
                    mutate(diff = B - D) %>%
                    arrange(desc(diff)) %>% pull(GN) %>% as.character
                
                # add an "all" group
                .data = bind_rows(.data, .data %>% mutate(GN = 'all'))

                # add level for the "all" group
                dset_lvls = c(dset_lvls, 'all')
                
                # form plot
                p = .data %>%
                    mutate(GN = fct_relevel(GN, dset_lvls)) %>%
                    ggplot(aes(GN, expr_val, color = gt)) +
                    geom_boxplot(
                                 # outlier.shape = NA, 
                                 fill = NA,
                                 width = 0.5,
                                 position = position_dodge(width = 0.9)) + 
                    scale_color_brewer(palette = 'Set1') + 
                    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
                    scale_y_continuous(guide = guide_axis(check.overlap = TRUE)) +
                    facet_grid(cols = vars(gene_name), scales = 'free_x', space = 'free_x') + 
                    theme_half_open() +
                    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
                          axis.title.x = element_blank(),
                          legend.position = 'none',
                          plot.title = element_text(size = 10, hjust = 0.5)) +
                    labs(y = 'Gene expression')
                p
            }))
        p_list
        # p_list = p_list$p %>% set_names(p_list$panel_id)
        # p = plot_grid(
        #   plot_grid(p_list$Msh3, p_list$Xrcc4, nrow = 1),
        #   p_list$Ssbp2, p_list$Atg10, ncol = 1
        # )
        # p_list = p_list %>% left_join(n_gn_per_gene, by = c('panel_id' = 'gene_name'))
        # p_list = p_list %>% mutate(n = n/max(n) + 0.3) 
        # p = plot_grid(plotlist = p_list$p, nrow = 1, rel_widths = p_list$n)
        # p
    }
    
    plot_signif_by_probe = function(all_expr_vals, probe_info, probe_max, gene, n_panel = 2) {
        to_plt = all_expr_vals %>%
            filter(gene_name == gene) %>%
            # number probes
            left_join(probe_info %>% select(probe, probe_pos), by = 'probe') %>%
            group_by(gene_name, GN) %>%
            # order by probe position
            arrange(probe_pos, .by_group = TRUE) %>%
            mutate(probe_num = probe %>% as.character %>% fct_inorder %>% as.integer %>% as.factor) %>%
            ungroup

        # add "is_eqtl" label
        to_plt = to_plt %>%
            left_join(probe_max %>% select(GN, gene_name, probe, is_eqtl), by = c('GN', 'gene_name', 'probe')) %>%
            mutate(is_eqtl = replace_na(is_eqtl, FALSE))

        # group into panels
        panel_ids = to_plt %>%
            distinct(GN) %>%
            mutate(panel_id = cut_number(GN %>% as.factor %>% as.integer, n = n_panel, label = FALSE))

        # make plot list
        p_list = to_plt %>%
            left_join(panel_ids, by = 'GN') %>%
            nest(data = !panel_id) %>%
            mutate(p = map(data, function(.x) {
                .x %>%
                ggplot(aes(probe_num, expr_val, color = is_eqtl)) +
                geom_boxplot(outlier.shape = '.') + 
                facet_grid(~GN, scales = 'free_x', space = 'free') +
                scale_x_discrete(guide = guide_axis(check.overlap = TRUE, n.dodge = 1)) +
                # scale_color_brewer(palette = 'Set1') + 
                scale_color_manual(values = c(`FALSE` = '#377eb8', `TRUE` = '#4daf4a')) + 
                theme_half_open() +
                theme(legend.position = 'none',
                      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
                      # axis.text.x = element_text(size = 8),
                      strip.text = element_text(angle = 90),
                      panel.spacing = unit(2, 'pt'),
                      text = element_text(size = 8)
                )
            }))
        p = plot_grid(plotlist = p_list$p, ncol = 1)
        p
    }
    redo = TRUE

2 QTL region

    ci_chr = '13'; ci_lo = 83.78112; ci_hi = 93.41913; ci_mid = 90.4

3 Other configs

    # how many top genes to take
    lod_thresh = 3
    top_x_genes = 10
    other_lvl = str_c('>', top_x_genes)
    
    # define DNA repair genes
    dna_repair_genes = c('Msh3', 'Xrcc4', 'Ssbp2', 'Atg10', 'Ccnh') 

4 Load data

  1. 54 representative datasets
  2. filtered datasets that have >30 strains
  3. common protein coding genes
    # cached data from eqtl analysis    
    cache_dir = '../data/analysis_cache'
    out_file = path(cache_dir, 'eqtl_data.rds')
    eqtl_data = readRDS(out_file)
    
    # gene table
    gn_table = readRDS('../data/analysis_cache/final_gn_table.rds') 

    # extract objects
    # best_trace      = eqtl_data$best_trace # want to recalculate best_trace (probes w/ * w/o snps)
    # best_point      = eqtl_data$best_point # want to recalculate best_point (probes w/ * w/o snps)
    sel_dsets         = eqtl_data$sel_dsets
    common_prot_genes = eqtl_data$common_prot_genes
    qtl_dsets         = eqtl_data$qtl_dsets
    eqtl_dsets_genes  = eqtl_data$eqtl_dsets_genes
    gene_ord          = eqtl_data$gene_ord
    probe_max         = eqtl_data$probe_max
    gene_pal          = eqtl_data$gene_pal
    signal_signal_cor = eqtl_data$signal_signal_cor

    # % expanded QTL mapping results 
    cache_dir = '../data/analysis_cache/bxd_qtl_scans'
    cache_file = path(cache_dir, 'final_qtl.rds')
    final_res = readRDS(cache_file)

    # gene expression data
    all_trace     = readRDS('../data/gene_expr/qtl_agg/gene_expr_db.rds')
    all_expr_vals = all_trace$expr_vals
    probe_info    = all_trace$probes

    # filtered for representative datasets and common genes
    # 54 representative datasets already accounted for
    # only use 
    all_expr_vals = all_expr_vals %>%
        filter(gene_name %in% common_prot_genes,
               GN %in% sel_dsets) %>%
        # filter(GN %in% sel_dsets) %>% # redundant
        semi_join(qtl_dsets, by = 'GN')
    
    # probes with no snps version
    all_expr_vals_w_snp = all_expr_vals # make a backup
    all_expr_vals = all_expr_vals %>%
        semi_join(probe_info %>% filter(n_var_per_probe == 0), by = 'probe')
    
    # best point within QTL sized region around gene
    get_best_point = function(data) {
        ci_hwind = (ci_hi - ci_lo)/2
        data %>%
            left_join(all_trace$genes, by = 'gene_name') %>%
            left_join(all_trace$markers, by = 'marker') %>%
            arrange(desc(LOD)) %>%
            mutate(across(c(gene_pos, gene_end), ~.x/1e6)) %>%
            filter(mark_pos >= (gene_end+gene_pos)/2 - ci_hwind, mark_pos <= (gene_end+gene_pos)/2 + ci_hwind) %>%
            distinct(GN, gene_name, mark_chr, .keep_all = TRUE) %>%
            mutate(across(where(is.factor), as.character))
    }
    best_point = list(
        all_probes = all_trace$signal %>% get_best_point,
        no_snps = all_trace$signal %>%
            semi_join(probe_info %>% filter(n_var_per_probe == 0), by = 'probe') %>%
            get_best_point
    )
    redo = TRUE

5 Aggregate gene expression data

3 ways of calculating a single expression value per GN/gene/strain

  1. Average expression per gene
  2. Probe with highest average expression
  3. Probe with best QTL signal
    # three versions of GN,gene,strain,expr_val dataframes
    unq_expr_vals = list(
        avg_expr_per_gene = all_expr_vals %>%
            semi_join(probe_info %>% distinct(probe, n_var_per_probe) %>% filter(n_var_per_probe == 0), by = 'probe') %>%
            group_by(GN, gene_name, strain) %>%
            summarise(expr_val = mean(expr_val), .groups = 'drop'),
        top_expr_probe = {
            max_expr_probes = all_expr_vals %>%
                semi_join(probe_info %>% distinct(probe, n_var_per_probe) %>% filter(n_var_per_probe == 0), by = 'probe') %>%
                group_by(GN, gene_name, probe) %>%
                summarise(avg_expr = mean(expr_val), .groups = 'drop') %>%
                group_by(GN, gene_name) %>%
                slice_max(n = 1, order_by = avg_expr, with_ties = FALSE) %>% 
                ungroup 
            all_expr_vals %>%
                semi_join(probe_info %>% distinct(probe, n_var_per_probe) %>% filter(n_var_per_probe == 0), by = 'probe') %>%
                semi_join(max_expr_probes, by = c('GN', 'gene_name', 'probe'))
        },
        top_qtl_probe = all_expr_vals %>%
            semi_join(probe_info %>% distinct(probe, n_var_per_probe) %>% filter(n_var_per_probe == 0), by = 'probe') %>%
            semi_join(best_point$no_snps %>% distinct(GN, probe, gene_name), by = c('GN', 'probe', 'gene_name'))
    )

    # same but using all probes (even ones with snps)
    # checked that this looks similar to so need to replot, either way it is correct to not use probes with snps for comparing expression data
    # we don't know how to scale expression with number of snps
    if (0) {
        unq_expr_vals_w_snp = list(
            avg_expr_per_gene = all_expr_vals_w_snp %>%
                group_by(GN, gene_name, strain) %>%
                summarise(expr_val = mean(expr_val), .groups = 'drop'),
            top_expr_probe = {
                max_expr_probes = all_expr_vals_w_snp %>%
                    group_by(GN, gene_name, probe) %>%
                    summarise(avg_expr = mean(expr_val), .groups = 'drop') %>%
                    group_by(GN, gene_name) %>%
                    slice_max(n = 1, order_by = avg_expr, with_ties = FALSE) %>% 
                    ungroup 
                all_expr_vals_w_snp %>%
                    semi_join(max_expr_probes, by = c('GN', 'gene_name', 'probe'))
            },
            top_qtl_probe = all_expr_vals_w_snp %>%
                semi_join(best_point$all_probes %>% distinct(GN, probe, gene_name), by = c('GN', 'probe', 'gene_name'))
        )
    }

    # check number of rows for each list
    # unq_expr_vals %>% map(nrow)

6 Overall gene expression levels of DNA repair genes

  1. Ssbp2 and Ccnh are more highly expressed than the other genes
  2. High variability
  3. Distribution within each boxplot can be multi-modal, because of differences in probes
    to_plt = all_expr_vals %>% 
        filter(gene_name %in% c('Msh3', 'Xrcc4', 'Ssbp2', 'Atg10', 'Ccnh'))

    p = to_plt %>%
        ggplot(aes(GN, expr_val, color = gene_name)) +
        geom_boxplot(outlier.shape = '.') + 
        geom_hline(data = ~.x %>% group_by(gene_name) %>% summarise(avg_expr_val = mean(expr_val)),
                   aes(yintercept = avg_expr_val)) + 
        scale_color_manual(values = gene_pal) + 
        scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) + 
        facet_wrap(~gene_name, nrow = 1) +
        theme_half_open() + 
        theme(
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)
            # axis.text.x = element_blank(),
            # axis.ticks.x = element_blank()
        )
    p

    ggsave('test.pdf', p, w = 10, h = 4)

7 Overall gene expression levels for all genes

    to_plt = all_expr_vals

    p = to_plt %>%
        group_by(gene_name) %>%
        mutate(avg_expr_val = median(expr_val)) %>%
        ungroup %>%
        mutate(gene_name = fct_reorder(gene_name, desc(avg_expr_val))) %>%
        ggplot(aes(GN, expr_val, color = gene_name)) +
        geom_boxplot(outlier.shape = '.') + 
        geom_hline(data = ~.x %>% distinct(gene_name, avg_expr_val),
                   aes(yintercept = avg_expr_val)) + 
        geom_hline(yintercept = 8, linetype = 'dashed') + 
        scale_color_manual(values = gene_pal) + 
        scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) + 
        facet_wrap(~gene_name, ncol = 7) +
        coord_cartesian(ylim = c(0, 20)) + 
        theme_half_open() + 
        theme(
              # axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
              legend.position = 'none',
              axis.text.x = element_blank(),
              axis.ticks.x = element_blank()
        )
    p

    ggsave('test.pdf', p, w = 10, h = 10)

    # figure print
    ggsave(path(plot_dir, 'Suppl_Fig_8.pdf'), p, w = 10, h = 10)

8 Compare DNA repair gene levels within each dataset (by tissue)

  1. Compare only among datasets where levels available for all genes
  2. Expression levels similar in some datasets but not in others where Ccnh and Ssbp2 are over-expressed
    to_plt = all_expr_vals %>%
        filter(gene_name %in% c('Msh3', 'Xrcc4', 'Ssbp2', 'Atg10', 'Ccnh')) %>%
        # keep only GNs were all genes appear
        group_by(GN) %>%
        mutate(n_gene = gene_name %>% unique %>% length) %>%
        ungroup %>%
        filter(n_gene == 5)

    # join tissue
    to_plt = to_plt %>%
        left_join(gn_table %>% 
            select(tissue, GN = sel_dset) %>%
            group_by(tissue) %>%
            mutate(tissue_num = 1:n()) %>%
            mutate(tissue_id = if_else(tissue_num != 1, str_c(tissue, '_', tissue_num), tissue)), by = 'GN')

    # sort so that tissues with greatest difference b/w genes are first
    gn_ord = to_plt %>%
        group_by(GN, gene_name) %>%
        summarise(avg_gene_expr = median(expr_val), .groups = 'drop') %>%
        group_by(GN) %>%
        summarise(gn_sd = sd(avg_gene_expr), .groups = 'drop') %>%
        mutate(GN = fct_reorder(GN, desc(gn_sd)))


    # gen plot
    p = to_plt %>%
        mutate(GN = fct_relevel(GN, gn_ord %>% pull(GN) %>% levels)) %>%
        ggplot(aes(tissue_id, expr_val, color = gene_name)) +
        geom_boxplot(outlier.shape = '.') + 
        facet_wrap(~GN, nrow = 2, scales = 'free_x') + 
        scale_color_manual(values = gene_pal) + 
        theme_half_open() + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
              strip.text = element_blank(),
              panel.spacing = unit(2, 'pt')
        )
    p

    ggsave('test.pdf', p, w = 8, h = 6)

9 Gene expression by tissue ALIGNED VERSION

  1. Compare among only datasets where levels available for all genes
  2. No obvious overlap b/w tissues where these genes are expressed high or low
  3. NOTE 02/04/22: deprecated
    to_plt = all_expr_vals %>%
        filter(gene_name %in% c('Msh3', 'Xrcc4', 'Ssbp2', 'Atg10', 'Ccnh')) %>%
        # keep only GNs were all genes appear
        group_by(GN) %>%
        mutate(n_gene = gene_name %>% unique %>% length) %>%
        ungroup %>%
        filter(n_gene == 5)

    # generate plot
    p = to_plt %>%
        left_join(gn_table %>% 
            select(tissue, GN = sel_dset) %>%
            group_by(tissue) %>%
            mutate(tissue_num = 1:n()) %>%
            mutate(tissue_id = if_else(tissue_num != 1, str_c(tissue, '_', tissue_num), tissue)), by = 'GN') %>%
        group_by(GN) %>% # this is across all genes
        mutate(avg_expr = mean(expr_val)) %>%
        ungroup %>%
        mutate(tissue_id = fct_reorder(tissue_id, desc(avg_expr))) %>%
        ggplot(aes(tissue_id, expr_val, color = gene_name)) +
        geom_boxplot(outlier.shape = '.') + 
        geom_hline(yintercept = 8) + 
        facet_grid(rows = vars(gene_name), scales = 'free_y') + 
        scale_color_manual(values = gene_pal) + 
        theme_half_open() + 
        theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1, vjust = 1),
              # strip.text = element_blank(),
              axis.title.x = element_blank(),
              panel.border = element_rect(color = 'black', linetype = 'dashed')
        )
    # p
    # ggsave('test.pdf', p, w = 6, h = 8)

10 eQTL vs. non-eQTL probes per gene

10.1 Msh3

    p = plot_signif_by_probe(all_expr_vals, probe_info, probe_max, 'Msh3', n_panel = 2)
    p

    ggsave('test.pdf', p, w = 12, h = 8)
    redo = TRUE

10.2 Ssbp2

    p = plot_signif_by_probe(all_expr_vals, probe_info, probe_max, 'Ssbp2', n_panel = 2)
    p

    ggsave('test.pdf', p, w = 12, h = 8)
    redo = TRUE

10.3 Atg10

    p = plot_signif_by_probe(all_expr_vals, probe_info, probe_max, 'Atg10', n_panel = 2)
    p

    ggsave('test.pdf', p, w = 12, h = 8)
    redo = TRUE

10.4 Single dataset for DNA repair genes

    gn = 'GN163'
    to_plt = all_expr_vals %>%
        filter(gene_name %in% dna_repair_genes) %>%
        filter(GN == gn) %>%
        # number probes
        left_join(probe_info %>% select(probe, probe_pos), by = 'probe') %>%
        group_by(gene_name, GN) %>%
        # order by probe position
        arrange(probe_pos, .by_group = TRUE) %>%
        mutate(probe_num = probe %>% as.character %>% fct_inorder %>% as.integer %>% as.factor) %>%
        ungroup

    # add "is_eqtl" label
    to_plt = to_plt %>%
        left_join(probe_max %>% select(GN, gene_name, probe, is_eqtl), by = c('GN', 'gene_name', 'probe')) %>%
        mutate(is_eqtl = replace_na(is_eqtl, FALSE))

    # make plot list
    p = to_plt %>%
        ggplot(aes(probe_num, expr_val, color = is_eqtl)) +
        geom_boxplot(outlier.shape = '.') + 
        facet_wrap(~gene_name, scales = 'free_x') +
        scale_x_discrete(guide = guide_axis(check.overlap = TRUE, n.dodge = 1)) +
        # scale_color_brewer(palette = 'Set1') + 
        scale_color_manual(values = c(`FALSE` = '#377eb8', `TRUE` = '#4daf4a')) + 
        theme_half_open() +
        theme(legend.position = 'right',
              axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
              # axis.text.x = element_text(size = 8),
              # strip.text = element_text(angle = 90),
              panel.spacing = unit(2, 'pt'),
              plot.title = element_text(hjust = 0.5, size = 10)
              # text = element_text(size = 8)
        ) + labs(title = gn)
    p

    ggsave('test.pdf', p, w = 12, h = 8)

11 Loci of interest

    # founder genotypes
    bxd_mark_gts = maxmarg(BXDstrs::qtl_data$snp_probs[,'13'], minprob = 0.5) %>% 
        .[[1]] %>%
        as.data.frame %>%
        rownames_to_column(var = 'strain') %>%
        pivot_longer(cols = !strain, names_to = 'marker', values_to = 'fou_gt')

    # select marker at peak
    vcfs = list(snp = '../data/vep_annot/bxd_snp_indel.annot.vcf.gz', sv = '../data/vep_annot/bxd_svs.annot.vcf.gz')
    loci_of_int = list(
        qtl_peak = final_res$qtl_res %>% 
            filter(metric == '% expanded') %>%
            unite('marker', c('chr', 'pos', 'end')) %>%
            slice_max(LOD, n = 1) %>%
            select(locus = marker) %>%
            mutate(vcf = vcfs[['snp']]),
        closest_to_ci_center = bxd_mark_gts %>%
            distinct(marker) %>%
            separate('marker', c('chr', 'pos', 'end'), convert = TRUE, remove = FALSE) %>%
            mutate(dist_to_peak = abs(pos - ci_mid*1e6)) %>%
            slice_min(n = 1, order_by = dist_to_peak, with_ties = FALSE) %>%
            select(locus = marker) %>%
            mutate(vcf = vcfs[['snp']]),
        atg10_variant    = tibble(locus = 'chr13_91154245_91154245', vcf = vcfs[['snp']]),
        te_insertion     = tibble(locus = 'chr13_92348038_92348424', vcf = vcfs[['sv']]),
        msh3_splice_var1 = tibble(locus = 'chr13_92348451_92348451', vcf = vcfs[['snp']]),
        msh3_splice_var2 = tibble(locus = 'chr13_92348452_92348452', vcf = vcfs[['snp']])
        # this cryptic frame shift is most likely an artifact
        # cryptic_frame_shift = tibble(locus = 'chr13_92348043_92348043', vcf = vcfs[['snp']])
    ) %>% map_df(~.x, .id = 'loc')
    loci_of_int
    # get genotypes at marker
    if (0) {
        # subset genotypes
        gts_of_int = bxd_mark_gts %>%
            mutate(fou_gt = recode(fou_gt, `1` = 'B', `2` = 'D'),
                   fou_gt = replace_na(fou_gt, 'miss')) %>%
            semi_join(loci_of_int, by = c('marker' = 'locus'))

        # check for markers
        gts_of_int %>% count(marker)
        # marker                      n
        # chr13_90420704_90420704   152
        # chr13_90440358_90440358   152
        # problem is that the atg10 variant is not included in the marker list using for QTL mapping
    } else {
        # debug
        # bcftools view ../data/vep_annot/bxd_svs.annot.vcf.gz chr13:92348030-92348434 | less -S
        
        # get raw genotypes from vcf
        # .x = 'chr13:92348038-92348424'; .y = '../data/vep_annot/bxd_svs.annot.vcf.gz'
        gts_of_int = loci_of_int %>% 
            separate('locus', c('chr', 'pos', 'end'), remove = FALSE) %>%
            rowwise %>%
            mutate(data = map2(sprintf('%s:%s-%s', chr, pos, end), vcf, function(.x, .y) {
                cmd = sprintf("bcftools query -f '[%%CHROM\t%%POS\t%%END\t%%REF\t%%ALT\t%%SAMPLE\t%%GT\t%%TGT\n]' %s -r %s", 
                              .y, 
                              .x)
                read_tsv(pipe(cmd), 
                         col_names = c('chr', 'pos', 'end', 'ref', 'alt', 'short_name', 'gt', 'tgt'),
                         col_types = cols(chr = 'c', pos = 'i', end = 'i', ref = 'c', alt = 'c', short_name = 'c', gt = 'c', tgt = 'c'))
            })) %>%
            ungroup
        gts_of_int = gts_of_int %>% select(loc, data) %>% unnest(data)
        
        # check genotypes
        # gts_of_int %>% pull(tgt) %>% unique

        # join real strain names
        gts_of_int = gts_of_int %>%
            left_join(BXDstrs::strain_info %>% select(short_name, bxd_id), by = 'short_name') %>%
            mutate(bxd_id = if_else(loc == 'te_insertion', short_name, bxd_id)) %>%
            filter(!is.na(bxd_id)) %>%
            # count(loc)
            select(loc, chr, pos, end, ref, alt, strain = bxd_id, gt, tgt) %>%
            nest(gt_data = c(strain, gt, tgt))
    }

12 LD b/w loci of interest

  1. Highly correlated, so really need to consider just one
    cor_mat = gts_of_int %>%
        unnest(gt_data) %>%
        separate('gt', c('gta', 'gtb'), convert = TRUE, fill = 'right') %>%
        pivot_wider(id_cols = strain, names_from = loc, values_from = gta) %>%
        column_to_rownames(var = 'strain')
    p = ggcorr(data = cor_mat, method = c('pairwise.complete.obs', 'pearson'), hjust = 0.75, angle = -25, label = TRUE)
    p

    ggsave('test.pdf', w = 6, h = 4)

13 %expanded by haplogroup and haplogroup/epoch

  1. Color by B/D founder haplotype @ different loci of interest
  2. 0/0 is “B” founder haplotype
    pxg = final_res$pheno_vals %>%
        filter(metric == 'proportion_expanded') %>%
        left_join(strain_info %>% 
              mutate(off_epoch = recode(off_epoch, epoch_1b = 'epoch_1a', epoch_1c = 'epoch_1a')) %>%
              select(strain = bxd_id, off_epoch), by = 'strain') %>%
        rename(perc_expand = pheno) %>%
        left_join(gts_of_int %>% unnest(gt_data), by = 'strain') %>%
        filter(loc == 'qtl_peak')

    # by epoch
    pos = position_dodge(width = 0.2)
    p1 = pxg %>%
        filter(gt != '0/1') %>%
        ggplot(aes(off_epoch, perc_expand)) +
        geom_boxplot(aes(color = gt), width = 0.2, position = pos) +
        geom_quasirandom(aes(color = gt), dodge.width = 0.2, width = 0.1, size = 1, position = pos) + 
        geom_text_repel(max.overlaps = 3,
                        aes(label = strain, color = gt),
                        min.segment.length = 0,
                        size = 2.5) +
        geom_text(data = ~.x %>% group_by(off_epoch) %>% summarise(n_avg = mean(n)), 
                  aes(label = sprintf('%0.1f', n_avg), y = 0.85), angle = 0, hjust = 0.5, size = 3) + 
        scale_color_brewer(palette = 'Set1') + 
        # guides(color = guide_legend(title = 'Founder haplogroup')) +
        # facet_wrap(~loc, nrow = 1) + 
        theme_half_open() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
              axis.title.x = element_blank(),
              plot.title = element_text(hjust = 0.5, size = 10),
              legend.position = 'none') +
        labs(y = '% expanded', title = 'By epoch')

    # overall by B/D
    p2 = pxg %>%
        filter(gt != '0/1') %>%
        # mutate(tgt = str_replace(tgt, '/', '/\n')) %>%
        # mutate(tgt = str_c(tgt, ' [', gt, ']')) %>%
        arrange(desc(gt)) %>% 
        # mutate(tgt = fct_inorder(tgt)) %>%
        ggplot(aes(gt, perc_expand, color = gt)) + 
        geom_quasirandom(width = 0.2) + 
        geom_boxplot(outlier.shape = NA, width = 0.2) + 
        scale_color_brewer(palette = 'Set1') + 
        # scale_y_continuous(position = 'top') +
        # facet_wrap(~loc, nrow = 1, scales = 'free_y', strip.position = 'bottom') + 
        theme_half_open() +
        theme(axis.title.y = element_blank(),
              plot.title = element_text(hjust = 0.5, size = 10)) + 
        labs(y = '% expanded', title = 'Overall')
    p = plot_grid(p1, p2, nrow = 1, axis = 'tb', align = 'h', rel_widths = c(1, 0.7))
    p

    ggsave('test.pdf', p, w = 8, h = 3.5)

    # fig export
    p__bd_by_epoch = p

14 Compare models of %expanded ~ variant

  1. Variant is Atg10 variant or TE insertion or TE insertion + splice variant
    if (0) {
    gts_of_int %>% 
        filter(loc %in% c('te_insertion', 'msh3_splice_var1')) %>% 
        unnest(gt_data) %>%
        separate('gt', c('gta', 'gtb'), convert = TRUE, fill = 'right') %>%
        pivot_wider(id_cols = strain, names_from = loc, values_from = gta) %>%
        filter(te_insertion != msh3_splice_var1)
    }

    to_fit = gts_of_int %>% 
        filter(loc %in% c('te_insertion', 'msh3_splice_var1', 'atg10_variant')) %>% 
        unnest(gt_data) %>%
        separate('gt', c('gta', 'gtb'), convert = TRUE, fill = 'right') %>%
        select(loc, strain, gt = gta) %>%
        pivot_wider(id_cols = strain, names_from = loc, values_from = gt) %>%
        left_join(pxg %>% distinct(strain, perc_expand), by = 'strain') %>%
        filter(!is.na(te_insertion) & !is.na(atg10_variant) & !is.na(msh3_splice_var1))
    fits = list(
        atg10_var = lm(perc_expand ~ atg10_variant, data = to_fit),
        te_insert = lm(perc_expand ~ te_insertion, data = to_fit),
        te_and_split = lm(perc_expand ~ te_insertion + msh3_splice_var1, data = to_fit)
    )
    fits %>% map(~broom::tidy(.x))
## $atg10_var
## # A tibble: 2 x 5
##   term          estimate std.error statistic   p.value
##   <chr>            <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)     0.687    0.00649    106.   2.45e-133
## 2 atg10_variant  -0.0647   0.00996     -6.49 1.42e-  9
## 
## $te_insert
## # A tibble: 2 x 5
##   term         estimate std.error statistic   p.value
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    0.686    0.00666    103.   1.26e-131
## 2 te_insertion  -0.0600   0.0101      -5.91 2.55e-  8
## 
## $te_and_split
## # A tibble: 3 x 5
##   term             estimate std.error statistic   p.value
##   <chr>               <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)        0.687    0.00661   104.    2.03e-131
## 2 te_insertion      -0.0127   0.0268     -0.474 6.36e-  1
## 3 msh3_splice_var1  -0.0513   0.0270     -1.90  5.94e-  2
    anova(fits$te_insert, fits$te_and_split)
    anova(fits$atg10_var, fits$te_and_split)

15 Check how probe aggregation affects view on differences in expression by haplotype

  1. Make sense that difference would be for top_qtl_probe, because this is by definition where the difference is detected
    # agg_type = 'top_qtl_probe'
    p_list = map(c('avg_expr_per_gene', 'top_expr_probe', 'top_qtl_probe'), function(agg_type) {
        # subset data
        .data = unq_expr_vals[[agg_type]] %>% 
            filter(gene_name == 'Msh3') %>%
            semi_join(eqtl_dsets_genes, by = c('GN', 'gene_name')) %>%
            # semi_join(qtl_dsets, by = 'GN') %>% # not necessary b/c eqtl_dsets_genes has this filtering already
            left_join(gts_of_int %>% filter(loc == 'qtl_peak') %>% unnest(gt_data) %>% select(strain, gt), by = 'strain')

        # run plot
        plt_avg_expr_per_fou_by_gn(.data)$p[[1]] + 
            coord_cartesian(ylim = c(-2, 2)) +
            labs(title = agg_type)
    })
    p = plot_grid(plotlist = p_list, nrow = 1)
    p

    ggsave('test.pdf', p, w = 10, h = 4)

16 Expression at peak LOD for top X eQTL genes + DNA repair genes

  1. Only dataset/gene pairs were eQTL signal is significant
  2. Only probes without snps
    gene_gt_df = unq_expr_vals[['top_qtl_probe']] %>%
        left_join(gene_ord %>% select(gene_name, Rank), by = 'gene_name') %>%
        filter(Rank != other_lvl | gene_name %in% dna_repair_genes) %>%
        mutate(gene_name = fct_reorder(gene_name, as.integer(Rank))) %>%
        arrange(gene_name) %>%
        semi_join(eqtl_dsets_genes, by = c('GN', 'gene_name')) %>%
        # semi_join(qtl_dsets, by = 'GN') %>% # not necessary b/c eqtl_dsets_genes has this filtering already
        left_join(gts_of_int %>% filter(loc == 'qtl_peak') %>% unnest(gt_data) %>% select(strain, gt), by = 'strain')
    p_list = gene_gt_df %>%
        plt_avg_expr_per_fou_by_gn
    p = plot_grid(plotlist = p_list$p, nrow = 3)
    p

    # ggsave('test.pdf', p, w = 18, h = 8)
    ggsave('test.pdf', p, w = 14, h = 10)

17 DNA repair gene expression by founder and by GN

  1. Only dataset/gene pairs were eQTL signal is significant

17.1 Detailed view by dataset

    # check
    # gene_gt_df %>% pull(strain) %>% unique
    p_list = gene_gt_df %>% 
        filter(gene_name %in% dna_repair_genes) %>% 
        plt_avg_expr_per_fou_by_gn

    # get rid of y-axis
    p_list = p_list %>%
        mutate(p = map(p, ~.x + theme(axis.title.y = element_blank())))

    # arrange
    # p = plot_grid(plotlist = p_list$p)
    p_top = (p_list %>% filter(panel_id == 'Ssbp2'))$p[[1]]
    p_bot = plot_grid(plotlist = (p_list %>% filter(panel_id != 'Ssbp2'))$p, nrow = 2)
    p = plot_grid(p_top, p_bot, ncol = 1, rel_heights = c(0.7, 1))
    p = plot_grid(ggdraw() + draw_text('Scaled gene expr.', x = 0.5, y = 0.5, size = 14, hjust = 0.5, vjust = 0.5, angle = 90),
                  p, nrow = 1, rel_widths = c(0.05, 1))
    p

    ggsave('test.pdf', p, w = 8, h = 8)

    # save figure
    ggsave(path(plot_dir, 'Suppl_Fig_13.pdf'), p, w = 8, h = 8)
    
    p__detailed_bd_by_gene = p

17.2 Overall “B” vs “D” differences

    if (0) { # test for Msh3
        # dataset ordering by difference b/w B and D
        lvls = gene_gt_df %>% 
            filter(gene_name %in% dna_repair_genes) %>% 
            # filter(loc == 'qtl_peak') %>%
            filter(gt %in% c('0/0', '1/1')) %>%
            mutate(gt = recode(gt, `0/0` = 'B', `1/1` = 'D')) %>%
            group_by(gene_name, GN, gt) %>%
            summarise(expr_val = mean(expr_val), .groups = 'drop') %>%
            pivot_wider(id_cols = c(gene_name, GN), names_from = gt, values_from = expr_val) %>%
            mutate(diff = B - D) %>%
            arrange(desc(diff))
        p = gene_gt_df %>%
            filter(gene_name %in% dna_repair_genes) %>% 
            group_by(GN, gene_name) %>%
            mutate(expr_val = scale(expr_val, center = TRUE, scale = FALSE)[,1]) %>%
            ungroup %>%
            filter(gene_name == 'Msh3') %>%
            filter(gt %in% c('0/0', '1/1')) %>%
            mutate(gt = recode(gt, `0/0` = 'B', `1/1` = 'D')) %>%
            mutate(GN = fct_relevel(GN, lvls %>% filter(gene_name == 'Msh3') %>% pull(GN) %>% as.character)) %>%
            ggplot(aes(GN, expr_val, color = gt)) +
            geom_boxplot() + 
            scale_color_brewer(palette = 'Set1') + 
            coord_cartesian(ylim = c(-1.5, 1.5)) +
            theme_half_open() + 
            theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
                  axis.title.x = element_blank(),
                  legend.position = 'none',
                  plot.title = element_text(size = 10, hjust = 0.5))
    } else { # full plot
        # dataset ordering by difference b/w B and D
        lvls = gene_gt_df %>% 
            filter(gene_name %in% dna_repair_genes) %>% 
            filter(gt %in% c('0/0', '1/1')) %>%
            mutate(gt = recode(gt, `0/0` = 'B', `1/1` = 'D')) %>%
            group_by(gene_name, gt) %>%
            summarise(expr_val = mean(expr_val), .groups = 'drop') %>%
            pivot_wider(id_cols = c(gene_name), names_from = gt, values_from = expr_val) %>%
            mutate(diff = B - D) %>%
            arrange(desc(diff)) 

        # make plot
        p = gene_gt_df %>%
            filter(gene_name %in% dna_repair_genes) %>% 
            group_by(GN, gene_name) %>%
            mutate(expr_val = scale(expr_val, center = TRUE, scale = FALSE)[,1]) %>%
            ungroup %>%
            filter(gt %in% c('0/0', '1/1')) %>%
            mutate(gt = recode(gt, `0/0` = 'B', `1/1` = 'D')) %>%
            # mutate(gene_name = fct_relevel(gene_name, lvls %>% pull(gene_name) %>% as.character)) %>%
            mutate(gene_name = fct_relevel(gene_name, c('Atg10', 'Ccnh', 'Msh3', 'Ssbp2', 'Xrcc4'))) %>%
            ggplot(aes(gt, expr_val, color = gt)) +
            geom_boxplot() + 
            # stat_compare_means(method = 't.test', comparisons = list(c('B', 'D'))) + 
            # stat_compare_means(aes(label = ..p.signif..), method = 't.test') + 
            scale_color_brewer(palette = 'Set1') + 
            facet_wrap(~gene_name, nrow = 1, scales = 'free_x') + 
            theme_half_open() + 
            theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
                  axis.title.x = element_blank(),
                  legend.position = 'none',
                  plot.title = element_text(size = 10, hjust = 0.5)) +
            labs(y = 'Scaled gene expression')
    }
    p

    ggsave('test.pdf', p, w = 10, h = 8)

    # fig export
    p__bd_gene_expr = p

18 Express corr heatmap

  1. All datasets
  2. Top X genes
  3. Expression values centered and scaled within every GN/gene pair
    # add color to chromosome
    make_lab = function(gene_name) {
        gene_chr = gene_ord %>% filter(gene_name == !!gene_name) %>% pull(mark_chr)
        gene_col = colorRampPalette(RColorBrewer::brewer.pal(8, "Dark2"))(19)[which(str_c('chr', 1:19) == gene_chr)]
        sprintf("<i style='color:%s'>(%s)</i> %s", gene_col, gene_chr, gene_name)
    }
    # highlight certain genes
    make_lab = function(gene_name) {
        top_genes = gene_ord %>% filter(Rank != other_lvl) %>% pull(gene_name)
        gene_col = if_else(gene_name %in% top_genes, 'red', 'blue')
        sprintf("<i style='color:%s'>%s</i>", gene_col, gene_name)
    }
    # highlight DNA repair genes each in their own color
    make_lab = function(gene_name) {
        # gene_col = c(Msh3  = '#1b9e77', Ssbp2 = '#d95f02', Xrcc4 = '#7570b3', Atg10 = '#e7298a')[gene_name]
        gene_col = gene_pal[gene_name]
        if (is.na(gene_col)) {
            gene_col = 'gray50' 
            sprintf("<i style='color:%s'>%s</i>", gene_col, gene_name)
        } else {
            sprintf("<i style='color:%s'><b>%s</b></i>", gene_col, gene_name)
        }
    }

    # gene order for heatmap
    hm_gene_ord = gene_ord %>%
        filter(gene_name %in% (gene_ord %>% filter(Rank != other_lvl | gene_name %in% dna_repair_genes) %>% pull(gene_name))) %>%
        arrange(gene_chr, gene_pos) %>%
        pull(gene_name) %>% as.character

    # make heatmaps
    # NOTE: make sure to center and scale expression values within every GN/gene pair
    # otherwise artificial correlations may be induced from GN-to-GN correlation structure
    covmat = unq_expr_vals[[c('top_qtl_probe', 'avg_expr_per_gene', 'top_expr_probe')[1]]] %>% 
        group_by(GN, gene_name) %>%
        mutate(expr_val = scale(expr_val, center = TRUE, scale = TRUE)[,1]) %>%
        ungroup %>%
        filter(gene_name %in% (gene_ord %>% filter(Rank != other_lvl | gene_name %in% dna_repair_genes) %>% pull(gene_name))) %>%
        pivot_wider(id_cols = c(GN, strain), names_from = gene_name, values_from = expr_val) %>%
        select(!c(GN, strain)) %>%
        cor(., method = 'pearson', use = 'pairwise.complete.obs')
    covmat = covmat[hm_gene_ord, hm_gene_ord]
    covmat[upper.tri(covmat, diag = TRUE)] = NA
    covmat = covmat %>%
        as.data.frame %>%
        rownames_to_column(var = 'gene.x') %>%
        mutate(gene.x = fct_inorder(gene.x)) %>%
        pivot_longer(!gene.x, names_to = 'gene.y', values_to = 'pears_cor') %>%
        mutate(gene.y = fct_relevel(gene.y, levels(gene.x))) %>%
        filter(!is.na(pears_cor))

    # simpler way to take correlation between every pair of genes within each dataset
    # this gives roughly similar value ranges
    if (0) {
        covmat = unq_expr_vals[[c('top_qtl_probe', 'avg_expr_per_gene', 'top_expr_probe')[1]]] %>% 
            unq_expr_vals %>% names
            nest(data = !GN) %>%
            group_by(GN) %>%
            summarise(covmat = map(data, function(.x) {
                covmat = .x %>%
                    filter(gene_name %in% (gene_ord %>% filter(Rank != other_lvl | gene_name %in% dna_repair_genes) %>% pull(gene_name))) %>%
                    pivot_wider(id_cols = strain, names_from = gene_name, values_from = expr_val) %>%
                    select(!strain) %>%
                    cor(., method = 'pearson', use = 'pairwise.complete.obs')
                # covmat = covmat[hm_gene_ord, hm_gene_ord]
                covmat[upper.tri(covmat, diag = TRUE)] = NA
                covmat = covmat %>%
                    as.data.frame %>%
                    rownames_to_column(var = 'gene.x') %>%
                    mutate(gene.x = fct_inorder(gene.x)) %>%
                    pivot_longer(!gene.x, names_to = 'gene.y', values_to = 'pears_cor') %>%
                    mutate(gene.y = fct_relevel(gene.y, levels(gene.x))) %>%
                    filter(!is.na(pears_cor))
                covmat
            }))
        covmat_avg = covmat %>% 
            unnest(covmat) %>%
            # filter(gene.x == 'Rasgrf2', gene.y == 'Cox7c') %>%
            # count(gene.x, gene.y)
            group_by(gene.x, gene.y) %>%
            summarise(avg_pears_cor = mean(pears_cor, na.rm = TRUE), 
                  stdev = sd(pears_cor, na.rm = TRUE), .groups = 'drop')
    }

    # make heatmap
    p = covmat %>%
        mutate(hlite = gene.x %in% dna_repair_genes & 
                       gene.y %in% dna_repair_genes) %>%
        # ggplot(aes(gene.x, gene.y, fill = avg_pears_cor)) +
        ggplot(aes(gene.x, gene.y, fill = pears_cor)) +
        geom_tile() + 
        geom_tile(data = ~.x %>% filter(hlite),
              color = 'gray50', fill = NA, size = 1) +
        # geom_text(aes(label = scales::number(avg_pears_cor, accuracy = 0.1)), size = 2) + 
        geom_text(aes(label = scales::number(pears_cor, accuracy = 0.01)), size = 3) + 
        # scale_fill_viridis_c(option = 'inferno') + 
        scale_fill_distiller(palette = 'RdBu', direction = -1, na.value = NA, limits = c(-0.5, 0.5)) + 
        scale_x_discrete(labels = function(x) map(x, make_lab)) + 
        scale_y_discrete(labels = function(x) map(x, make_lab)) + 
        theme_half_open() + 
        theme(
              axis.text.x = element_markdown(angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_markdown(),
              # axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
              # axis.text.y = element_text(),
              axis.title.x = element_blank(),
              axis.title.y = element_blank()
        )

    p

    ggsave('test.pdf', p, w = 8, h = 6.5)

    # this will be second panel of a supplementary figure
    # fig export
    p__gene_corr = p

19 REPRODUCTION: QTL mapping for all mutator phentypes

    phenos = c(
        '% denovo'           = 'denovo_perc_abundance',
        'delta (RU) expan'   = 'expand_delta_ru',
        'delta (RU) contr'   = 'contract_delta_ru',
        '% expanded'         = 'proportion_expanded'
    )

    p = final_res$qtl_res %>%
        # NOTE: don'te need the str_len phenotype anymore
        arrange(match(metric, names(phenos))) %>%
        mutate(delta_dir = if_else(str_detect(metric, ' (expan|contr)$'),
                                   str_replace(metric, '.* (expan|contr)$', '\\1'), 
                                   'all')) %>%
        mutate(metric = if_else(delta_dir != 'all', 
                                str_replace(metric, ' (expan|contr)$', ''),
                                metric)) %>%
        mutate(metric = fct_inorder(metric)) %>%
        # recode labels for final figure
        mutate(metric = recode(metric, !!!c('% denovo' = 'mutation\ncount', 
                                            'delta (RU)' = 'expansion\nsize', 
                                            '% expanded' = 'expansion\npropensity'))) %>%
        mutate(chr = str_replace(chr, 'chr', '')) %>%
        mutate(chr = fct_relevel(chr, str_sort(unique(chr), numeric = TRUE))) %>%
        # filter(metric %in% c('% denovo', '% expanded', 'delta (RU) expan')) %>%
        mutate(across(c(pos, end), ~.x/1e6)) %>%
        ggplot(aes(pos, LOD, color = delta_dir)) + 
        geom_step() +
        geom_hline(data = ~.x %>% distinct(metric, chr, lod_thresh, delta_dir),
               aes(yintercept = lod_thresh, color = delta_dir), linetype = 'dashed') + 
        facet_grid(metric~chr, scales = 'free_x', switch = 'x') + 
        scale_x_continuous(breaks = scales::breaks_pretty(n = 2), 
                           guide = guide_axis(angle = 60)) +
        scale_color_brewer(palette = 'Dark2', guide = guide_legend(title = NULL)) + 
        # coord_cartesian(ylim = c(0, 4)) +
        theme_half_open() +
        theme(
            panel.spacing.x = unit(0, 'pt'),
            axis.title.x = element_blank(),
            strip.placement = 'outside',
            strip.text.y = element_text(angle = 90),
            panel.border = element_rect(color = 'gray70'),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            legend.position = 'top'
        )
    p

    ggsave('test.pdf', p, w = 8, h = 4)

    # fig export
    p__qtl_mapping = p

20 REPRODUCTION: locus zoom with genes under QTL

    genes_qtl_plot = function(qtl_data) {
        # manual color scale
        # coul = c(colorRampPalette(RColorBrewer::brewer.pal(9, "PRGn"))(top_x_genes), 'gray30')
        # confidence interval around QTL from 2_mutator_pheno_scan.Rmd using 1.5 LOD drop method
        gene_h = 0.1
        yl = 2.5
        xlims = c(ci_lo-3, ci_hi+3)

        p = gene_ord %>%
            arrange(gene_pos, gene_end) %>%
            mutate(y = (yl+0.1)+IRanges::disjointBins(IRanges::IRanges(gene_pos, gene_end))*gene_h*1.2) %>%
            mutate(hlite = if_else(gene_name %in% dna_repair_genes, gene_name, NA_character_)) %>%
            ggplot(aes(x = x, y = y, width = width, height = gene_h)) +
            geom_rect(aes(xmin = ci_lo, xmax = ci_hi, ymin = -Inf, ymax = Inf), fill = 'gray90') + 
            geom_step(data = qtl_data, aes(pos, LOD), inherit.aes = FALSE) + 
            geom_vline(xintercept = ci_mid, linetype = 'dashed') + 
            geom_text_repel(data = ~.x %>% filter(Rank != other_lvl | gene_name %in% dna_repair_genes),
                            aes(label = gene_name,
                                fontface = if_else(!is.na(hlite), 'bold', 'plain'),
                                color = gene_name
                            ),
                            force_pull   = 0, # do not pull toward data points
                            nudge_y      = 0,
                            direction    = "x",
                            angle        = 90,
                            hjust        = 0,
                            segment.size = 0.2,
                            ylim         = c(yl+1.5, NA),
                            max.overlaps = Inf
                            ) + 
            geom_tile(color = NA, alpha = 1) +
            coord_cartesian(xlim = xlims, ylim = c(yl, NA)) + 
            # scale_color_brewer(palette = 'Dark2', na.value = 'gray50') + 
            scale_color_manual(values = gene_pal) + 
            scale_x_continuous(breaks = scales::breaks_width(width = 2)) + 
            theme_half_open() + 
            theme(legend.position = 'none') +
            labs(x = 'Mb', y = 'LOD')
        p
    }

    p = final_res$qtl_res %>%
        filter(metric == '% expanded', chr == 'chr13') %>% 
        mutate(across(c(pos, end), ~.x/1e6)) %>% 
        genes_qtl_plot
    p

    ggsave('test.pdf', p, w = 8, h = 4)

    # fig export
    p__qtl_genes = p

21 REPRODUCTION: eQTL traces for DNA repair genes (for main figure)

    top_signals = eqtl_data$best_trace %>% 
        filter(gene_name %in% dna_repair_genes) %>%
        semi_join(eqtl_dsets_genes, by = c('GN', 'gene_name')) %>%
        # semi_join(qtl_dsets, by = 'GN') %>% # not necessary b/c eqtl_dsets_genes has this filtering already
        group_by(gene_name, GN) %>%
        mutate(maxLOD = max(LODadj)) %>%
        ungroup
    p = ggplot() +
        geom_step(data = top_signals,
                  # aes(mark_pos, LODadj, group = GN, alpha = maxLOD, color = gene_name)) +
                  aes(mark_pos, LODadj, group = GN, color = gene_name)) +
        geom_step(data = final_res$qtl_res %>%
                      filter(metric == '% expanded', chr == 'chr13') %>%
                      mutate(across(c(pos, end), ~.x/1e6)),
                  aes(pos, LOD), inherit.aes = FALSE) + 
        scale_color_brewer(palette = 'Dark2') + 
        scale_fill_brewer(palette = 'Dark2') + 
        scale_x_continuous(breaks = scales::breaks_pretty(), guide = guide_axis(check.overlap = TRUE)) + 
        facet_wrap(~gene_name, nrow = 1) + 
        theme_half_open() +
        theme(legend.position = 'none'
              # panel.spacing = unit(0.5, 'lines')
        ) +
        labs(x = 'Mb', y = 'LOD')
    p

    ggsave('test.pdf', p, w = 10, h = 4)
    
    # fig export
    p__eqtl_dna_repair = p

22 REPRODUCTION: QTL vs eQTL overlap

    # get top dataset per gene
    top_dset_per_gene = eqtl_data$best_trace %>% 
        semi_join(eqtl_dsets_genes, by = c('GN', 'gene_name')) %>%
        left_join(gene_ord %>% select(gene_name, gene_pos, Rank), by = 'gene_name') %>%
        arrange(desc(LODadj)) %>%
        distinct(gene_name, .keep_all = TRUE) %>%
        distinct(GN, gene_name)

    # xlims = c(0, 125)
    xlims = c(55, 110)
    to_plt = eqtl_data$best_trace %>% 
        # dense_traces$signal %>% left_join(dense_traces$markers, by = 'marker') %>%
        semi_join(eqtl_dsets_genes, by = c('GN', 'gene_name')) %>%
        left_join(gene_ord %>% select(gene_name, gene_pos, Rank), by = 'gene_name') %>%
        filter(Rank != other_lvl | gene_name %in% dna_repair_genes) %>%
        mutate(gene_name = fct_reorder(gene_name, gene_pos)) %>%
        semi_join(top_dset_per_gene, by = c('GN', 'gene_name')) %>%
        rename(pos = mark_pos)

    p1 = to_plt %>%
        # mutate(gene_name = fct_relevel(gene_name, genes_to_lab)) %>%
        ggplot(aes(pos, LODadj, color = gene_name)) + 
        geom_step() +
        scale_color_manual(values = gene_pal, guide = guide_legend(title = 'eQTL signal')) + 
        coord_cartesian(xlim = xlims) + 
        theme_half_open() + 
        theme(legend.position = 'right',
              legend.key.size = unit(5, 'pt')
        )
    p2 = final_res$qtl_res %>%
        filter(metric == '% expanded', chr == 'chr13') %>%
        mutate(across(c(pos, end), ~.x/1e6)) %>%
        ggplot(aes(pos, LOD, color = metric)) +  
        geom_step() +
        scale_color_manual(values = 'black', guide = guide_legend(title = 'QTL signal')) +
        coord_cartesian(xlim = xlims) + 
        theme_half_open() 
    p = plot_grid(p1, p2, ncol = 1, axis = 'lr', align = 'v', rel_heights = c(1, 0.7))
    p

    ggsave('test.pdf', p, w = 6, h = 5)

    # fig export
    p__eqtl_overlay = p

23 REPRODUCTION: colocalization

    # calculate correlation between signals, marker per marker
    p = signal_signal_cor %>%
        pivot_longer(!c(GN, gene_name), names_to = 'cor_type', values_to = 'qtl_eqtl_cor') %>%
        filter(cor_type == 'main_qtl_eqtl_cor') %>%
        ggplot(aes(gene_name, qtl_eqtl_cor)) + 
        geom_boxplot(width = 0.4) + 
        geom_quasirandom(aes(color = GN), groupOnX = TRUE) + 
        scale_color_viridis_d(guide = guide_legend(title = 'Dataset', ncol = 4)) + 
        # facet_wrap(~cor_type) +
        theme_half_open() + 
        theme(
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
            axis.title.x = element_blank(),
            legend.key.size = unit(3, 'pt')
        ) + 
        labs(y = 'QTL/eQTL score corr.')
    p

    ggsave('test.pdf', p, w = 6, h = 4)

    # test for significance by ANOVA
    signal_signal_cor %>%
        pivot_longer(!c(GN, gene_name), names_to = 'cor_type', values_to = 'qtl_eqtl_cor') %>%
        filter(cor_type == 'main_qtl_eqtl_cor') %>%
        aov(qtl_eqtl_cor ~ gene_name, data = .) %>%
        broom::tidy()
    # fig export
    p__coloc = p

24 Main figure 2

    p = plot_grid(
            plot_grid(p__bd_by_epoch, p__qtl_genes, labels = c('a', 'b'), rel_heights = c(1, 1), nrow = 1),
            p__qtl_mapping,
            ncol = 1, rel_heights = c(0.8, 1), labels = c('', 'c')
    )
    p
## Warning: ggrepel: 144 unlabeled data points (too many overlaps). Consider increasing
## max.overlaps

    w = 12; h = 8 
    ggsave('test.pdf', p, w = w, h = h)
## Warning: ggrepel: 144 unlabeled data points (too many overlaps). Consider increasing
## max.overlaps
    ggsave(path(plot_dir, 'Fig2.pdf'), p, w = w, h = h)
## Warning: ggrepel: 144 unlabeled data points (too many overlaps). Consider increasing
## max.overlaps

25 Main figure 4

    p = plot_grid(
            plot_grid(p__eqtl_dna_repair, p__bd_gene_expr, nrow = 1, labels = c('a', 'b')),
            p__coloc + theme(plot.margin = margin(l = 0.1, b = 0.1, t = 0.1, r = 3, unit = 'cm')), 
            ncol = 1, labels = c('', 'c'), vjust = -1
    )
    p

    w = 10; h = 6.5 
    ggsave('test.pdf', p, w = w, h = h)
    ggsave(path(plot_dir, 'Fig4.pdf'), p, w = w, h = h)